Compare cell type-specific gene expression signatures generated from snRNA-seq measured by SMART-seq vs. 10x 3’v3.
Both datasets measured gene expression of brain cell types from middle temporal gyrus (MTG). The SMART-seq data were published earlier (Hodge et al. 2019). The 10x data were downloaded from cellxgene collection of SEA-AD (Seattle Alzheimer’s Disease Brain Cell Atlas) data.
The SMART-seq were processed by codes saved at our GitHub repo scRNAseq_pipelines. We used different clustering methods to examine whether the clustering results are consistent with cell type annotation, and choose those cells whose cluster membership is consistent with cell type annotation to generate gene expression signature.
The 10x data were processed by code step1_SEA_AD.py in
this folder. We simply calculated two cell type-specific gene expression
measures for each gene. One is the summation of UMI counts across all
the cells of one cell type, and the other is mean value of read-depth
normalized UMIs.
git_dir is the local dir for the Github repo
scRNAseq_pipelines.tenx_dir is the local dir where saved the 10x gene
expressio data generated by step1_SEA_AD.py.library(data.table)
library(stringr)
library(org.Hs.eg.db)
library(ICeDT)
library(ggplot2)
library(ggpubr)
library(viridis)
library(ggpointdensity)
library(RColorBrewer)
theme_set(theme_classic())
git_dir = "~/research/GitHub/scRNAseq_pipelines"
tenx_dir = "~/research/data/SEA-AD"
dim1 <- function(x){if(is.vector(x)) length(x) else dim(x)}
is.uique <- function(x){ length(x) == length(unique(x))}The data were generated by codes in our Github repo
scRNAseq_pipelines. Need to read the expression of all the
genes in order to calculate TPM.
rds = readRDS(file.path(git_dir, "_brain_cell_type/step3_DE_output",
"all_genes_MTG.rds"))
lapply(rds, dim1)## $anno
## [1] 37643 13
##
## $SIG
## [1] 37643 6
##
## $cor_between_cell_types
## [1] 1
##
## $cor_between_cell_type_and_gene_length
## [1] 1
##
## $mean_log_gene_length
## [1] 1
##
## $sd_log_gene_length
## [1] 1
sm2_data = rds$SIG
sm2_anno = rds$anno
rm(rds)
rds = readRDS(file.path(git_dir, "_brain_cell_type/step3_DE_output",
"signature_MTG.rds"))
lapply(rds, dim1)## $anno
## [1] 693 10
##
## $SIG
## [1] 693 6
sm2_markers = rownames(rds$SIG)
rm(rds)
cts = readRDS(file.path(git_dir, "MTG/ct_matrix_human_MTG.rds"))
dim(cts)## [1] 37657 7
cts[1:2,] ## Inh Exc Oligo OPC Astro Micro Endo
## 5-HT3C2 1615 2610 78 296 16 3 0
## A1BG 16134 54716 6381 0 55 245 0
cts = data.frame(cts)
cts$gene = rownames(cts)
sm2_df = data.frame(sm2_data)
sm2_df$gene = rownames(sm2_data)
sm2_df = merge(sm2_df, cts, by="gene", suffixes=c("", "_count"))
dim(sm2_df)## [1] 37643 14
sm2_df[1:2,]## gene Astro Exc Inh Micro Oligo OPC
## 1 5-HT3C2 0.05574913 0.2499282 0.3910412 0.0483871 0.2516129 1.264957
## 2 A1BG 0.19163763 5.2354799 3.9065375 3.9516129 20.5838710 0.000000
## Inh_count Exc_count Oligo_count OPC_count Astro_count Micro_count Endo
## 1 1615 2610 78 296 16 3 0
## 2 16134 54716 6381 0 55 245 0
cor(log10(sm2_df$Exc_count), log10(sm2_df$Exc))## [1] 0.9998821
smrt_df = cts[,c(8,1:7)]
dim(smrt_df)## [1] 37657 8
smrt_df[1:2,] ## gene Inh Exc Oligo OPC Astro Micro Endo
## 5-HT3C2 5-HT3C2 1615 2610 78 296 16 3 0
## A1BG A1BG 16134 54716 6381 0 55 245 0
files = list.files(tenx_dir, pattern="\\.csv")
length(files)## [1] 12
inh_cts = c("Lamp5", "Pvalb", "Sst", "Vip")
exc_cts = c("L2_3_IT", "L4_IT", "L5_IT", "L6_IT")
cell_types = gsub("_genes_info.csv", "", files, fixed = TRUE)
cell_types## [1] "Astro" "L2_3_IT" "L4_IT" "L5_IT" "L6_IT" "Lamp5"
## [7] "Micro-PVM" "Oligo" "OPC" "Pvalb" "Sst" "Vip"
dat_10x = list()
for(i in 1:length(files)){
f1 = files[i]
ct = cell_types[i]
if(ct %in% inh_cts){ ct = paste0("Inh_", ct) }
if(ct %in% exc_cts){ ct = paste0("Exc_", ct) }
dat_10x[[ct]] = fread(file.path(tenx_dir, f1))
if(i == 1){
tenx_df = dat_10x[[ct]][,c("gene_ids", "feature_name")]
}else{
stopifnot(all(tenx_df$gene_ids == dat_10x[[ct]]$gene_ids))
}
tenx_df[[ct]] = dat_10x[[ct]]$total_counts
}
dim(tenx_df)## [1] 36517 14
tenx_df[1:2,]## gene_ids feature_name Astro Exc_L2_3_IT Exc_L4_IT Exc_L5_IT Exc_L6_IT
## 1: ENSG00000000003 TSPAN6 8310 6458 3049 3142 1146
## 2: ENSG00000000005 TNMD 174 1104 496 496 70
## Inh_Lamp5 Micro-PVM Oligo OPC Inh_Pvalb Inh_Sst Inh_Vip
## 1: 343 94 373 2574 2222 463 3149
## 2: 23 1 8 89 225 65 210
L_23 = dat_10x$Exc_L2_3_IT
dim(L_23)## [1] 36517 11
L_23[1:2,]## gene_ids feature_is_filtered feature_name feature_reference
## 1: ENSG00000000003 FALSE TSPAN6 NCBITaxon:9606
## 2: ENSG00000000005 FALSE TNMD NCBITaxon:9606
## feature_biotype n_cells_by_counts mean_counts pct_dropout_by_counts
## 1: gene 6086 0.019564658 98.15623
## 2: gene 1024 0.003344593 99.68978
## total_counts mean_rd_normalized_count var_rd_normalized_count
## 1: 6458 0.006393405 0.0034943754
## 2: 1104 0.001037519 0.0005624617
L_23 = L_23[which(L_23$pct_dropout_by_counts <= 98),]
dim(L_23)## [1] 19530 11
ggplot(L_23, aes(x=log10(total_counts),
y=log10(mean_rd_normalized_count))) +
geom_point(alpha=0.5, color="orange")ggplot(L_23, aes(x=log10(mean_rd_normalized_count),
y=log10(var_rd_normalized_count))) +
geom_point(alpha=0.5, color="orange")gtf_fn = file.path(git_dir, "MTG/GTF2/gene_info_v26.rds")
gtf = readRDS(gtf_fn)
gtf[1:2,]## Chr V2 V3 Start End V6 V7 V8 ENSG GENE
## 1 chr1 HAVANA gene 11869 14403 . + . ENSG00000223972.5 DDX11L1
## 7 chr1 HAVANA gene 14410 29553 . - . ENSG00000227232.5 WASH7P
exon_fn = file.path(git_dir,
"MTG/GTF2/exon_by_genes_gencode_v26.rds")
exon = readRDS(exon_fn)
dim(gtf)## [1] 56292 10
length(exon)## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## [1] 56292
stopifnot(setequal(names(exon), gtf$ENSG))
gtf = gtf[match(names(exon), gtf$ENSG),]
gene_len = sum(BiocGenerics::width(GenomicRanges::reduce(exon)))
gtf$gene_length = as.numeric(gene_len)
rm(exon)
gtf = gtf[which(gtf$Chr %in% paste0("chr",1:22)),]
gtf$ChrNum = as.integer(gsub("chr","",gtf$Chr))
gtf = gtf[order(gtf$ChrNum, gtf$Start),]
gtf$ENSG_version = gtf$ENSG
gtf$ENSG = gsub("\\..*","", gtf$ENSG_version)
dim(gtf)## [1] 53288 13
gtf[1:2,]## Chr V2 V3 Start End V6 V7 V8 ENSG GENE gene_length
## 1 chr1 HAVANA gene 11869 14403 . + . ENSG00000223972 DDX11L1 1729
## 7 chr1 HAVANA gene 14410 29553 . - . ENSG00000227232 WASH7P 1328
## ChrNum ENSG_version
## 1 1 ENSG00000223972.5
## 7 1 ENSG00000227232.5
gene_anno = as.data.frame(sm2_anno[,c("gene", "chromosome",
"entrez_id")])
dim(gene_anno)## [1] 37643 3
gene_anno[1:2,]## gene chromosome entrez_id
## 5-HT3C2 5-HT3C2 3 389180
## A1BG A1BG 19 1
length(unique(gene_anno$gene))## [1] 37643
map1 = mapIds(org.Hs.eg.db, keys=as.character(gene_anno$entrez_id),
column='ENSEMBL', keytype='ENTREZID')## 'select()' returned 1:many mapping between keys and columns
map1[1:5]## 389180 1 503538 29974
## NA "ENSG00000121410" "ENSG00000268895" "ENSG00000148584"
## 2
## "ENSG00000175899"
stopifnot(names(map1) == gene_anno$entrez_id)
gene_anno$ENSG = as.character(map1)
table(is.na(gene_anno$ENSG))##
## FALSE TRUE
## 24529 13114
gene_anno = gene_anno[which(!is.na(gene_anno$ENSG)),]
t1 = table(gene_anno$ENSG)
table(t1)## t1
## 1 2 3
## 24395 64 2
t1[t1 > 2]##
## ENSG00000186645 ENSG00000223972
## 3 3
gene_anno[gene_anno$ENSG == names(t1[t1 > 2])[1],]## gene chromosome entrez_id ENSG
## SPDYE10P SPDYE10P 7 643862 ENSG00000186645
## SPDYE12P SPDYE12P 7 100101268 ENSG00000186645
## SPDYE17 SPDYE17 7 102723849 ENSG00000186645
non_uniq_ENSG = names(t1)[t1 > 1]
gene_anno = gene_anno[-which(gene_anno$ENSG %in% non_uniq_ENSG),]
dim(gene_anno)## [1] 24395 4
gene_anno[1:2,]## gene chromosome entrez_id ENSG
## A1BG A1BG 19 1 ENSG00000121410
## A1BG-AS1 A1BG-AS1 19 503538 ENSG00000268895
table(gene_anno$ENSG %in% gtf$ENSG)##
## FALSE TRUE
## 1371 23024
gene_anno = merge(gene_anno, gtf, by="ENSG")
gene_anno = gene_anno[,c("ENSG", "gene", "Chr", "Start", "End", "gene_length")]
dim(gene_anno)## [1] 23024 6
gene_anno[1:2,]## ENSG gene Chr Start End gene_length
## 1 ENSG00000000419 DPM1 chr20 50934867 50958555 1207
## 2 ENSG00000000457 SCYL3 chr1 169849631 169894267 5586
smrt_df = merge(smrt_df, gene_anno, by="gene")
dim(smrt_df)## [1] 23024 13
smrt_df[1:2,]## gene Inh Exc Oligo OPC Astro Micro Endo ENSG Chr
## 1 A1BG 16134 54716 6381 0 55 245 0 ENSG00000121410 chr19
## 2 A1BG-AS1 2760 8445 359 1 1 1 0 ENSG00000268895 chr19
## Start End gene_length
## 1 58345178 58353499 2629
## 2 58347751 58354356 1411
cts_smrt = c("Astro", "Exc", "Inh", "Micro", "Oligo", "OPC")
cr1 = cor(smrt_df$gene_length, smrt_df[,cts_smrt], method="spearman")
cr2 = cor(smrt_df$gene_length, smrt_df[,cts_smrt]/smrt_df$gene_length,
method="spearman")
glist = list()
for(ct_i in cts_smrt){
glist[[ct_i]] = ggplot(smrt_df,
aes_string(x="log10(gene_length)",
y=sprintf("log10(%s)", ct_i))) +
geom_pointdensity() + scale_color_viridis() +
ggtitle(sprintf("%s, Sp. corr: %.2f/%.2f", ct_i,
cr1[,ct_i], cr2[,ct_i])) +
theme(legend.position = "none")
}
ggarrange(plotlist=glist, nrow = 2, ncol = 3)dim(tenx_df)## [1] 36517 14
cols2kp = c("ENSG", "Chr", "Start", "End", "gene_length")
tenx_df = merge(tenx_df, gtf[,cols2kp], by.x="gene_ids", by.y="ENSG")
names(tenx_df)[1:2] = c("ENSG", "gene")
names(tenx_df)[which(names(tenx_df) == "Micro-PVM")] = "Micro"
dim(tenx_df)## [1] 32320 18
tenx_df[1:2,]## ENSG gene Astro Exc_L2_3_IT Exc_L4_IT Exc_L5_IT Exc_L6_IT
## 1: ENSG00000000419 DPM1 32612 550753 205876 213916 110076
## 2: ENSG00000000457 SCYL3 19382 241470 81828 113964 45380
## Inh_Lamp5 Micro Oligo OPC Inh_Pvalb Inh_Sst Inh_Vip Chr Start
## 1: 27030 9424 31439 11798 90418 32660 67887 chr20 50934867
## 2: 15370 7634 16463 10702 40361 16518 32396 chr1 169849631
## End gene_length
## 1: 50958555 1207
## 2: 169894267 5586
cts_10x = names(tenx_df)[3:14]
cts_10x## [1] "Astro" "Exc_L2_3_IT" "Exc_L4_IT" "Exc_L5_IT" "Exc_L6_IT"
## [6] "Inh_Lamp5" "Micro" "Oligo" "OPC" "Inh_Pvalb"
## [11] "Inh_Sst" "Inh_Vip"
glist = list()
for(i in 1:length(cts_10x)){
ct1 = cts_10x[i]
cr1 = cor(tenx_df$gene_length, tenx_df[[ct1]], method="spearman")
cr2 = cor(tenx_df$gene_length, tenx_df[[ct1]]/tenx_df$gene_length,
method="spearman")
glist[[i]] = ggplot(tenx_df, aes_string(x="log10(gene_length)",
y=sprintf("log10(%s)", ct1))) +
geom_pointdensity() + scale_color_viridis() +
ggtitle(sprintf("%s, Sp. corr: %.2f/%.2f", ct_i, cr1, cr2)) +
theme(legend.position = "none")
}
ggarrange(plotlist=glist, nrow = 4, ncol = 3)theme1 = theme(axis.text.x = element_text(angle = 60, vjust = 1,
size = 11, hjust = 1),
axis.text.y = element_text(size = 11))
stopifnot(is.uique(smrt_df$ENSG))
stopifnot(is.uique(tenx_df$ENSG))
dat_combined = merge(smrt_df, tenx_df, by="ENSG",
suffixes = c("","_10x"))
dim(dat_combined)## [1] 19179 30
dat_combined[1:2,]## ENSG gene Inh Exc Oligo OPC Astro Micro Endo Chr
## 1 ENSG00000000419 DPM1 301355 683217 16911 13857 11489 1944 925 chr20
## 2 ENSG00000000457 SCYL3 55202 145707 3320 3051 6389 1091 0 chr1
## Start End gene_length gene_10x Astro_10x Exc_L2_3_IT Exc_L4_IT
## 1 50934867 50958555 1207 DPM1 32612 550753 205876
## 2 169849631 169894267 5586 SCYL3 19382 241470 81828
## Exc_L5_IT Exc_L6_IT Inh_Lamp5 Micro_10x Oligo_10x OPC_10x Inh_Pvalb Inh_Sst
## 1 213916 110076 27030 9424 31439 11798 90418 32660
## 2 113964 45380 15370 7634 16463 10702 40361 16518
## Inh_Vip Chr_10x Start_10x End_10x gene_length_10x
## 1 67887 chr20 50934867 50958555 1207
## 2 32396 chr1 169849631 169894267 5586
table(dat_combined$gene == dat_combined$gene_10x)##
## FALSE TRUE
## 2256 16923
cts_10x_adj = cts_10x
ww1 = which(cts_10x %in% cts_smrt)
cts_10x_adj[ww1] = paste0(cts_10x[ww1], "_10x")
cr_mat = cor(dat_combined[,cts_smrt], dat_combined[,cts_10x_adj],
method="spearman")
round(cr_mat, 2)## Astro_10x Exc_L2_3_IT Exc_L4_IT Exc_L5_IT Exc_L6_IT Inh_Lamp5 Micro_10x
## Astro 0.84 0.64 0.65 0.65 0.65 0.68 0.67
## Exc 0.72 0.86 0.86 0.86 0.86 0.82 0.69
## Inh 0.73 0.80 0.81 0.81 0.81 0.85 0.70
## Micro 0.57 0.53 0.53 0.53 0.53 0.55 0.76
## Oligo 0.69 0.66 0.67 0.66 0.66 0.69 0.68
## OPC 0.75 0.70 0.71 0.71 0.71 0.74 0.69
## Oligo_10x OPC_10x Inh_Pvalb Inh_Sst Inh_Vip
## Astro 0.73 0.74 0.67 0.67 0.67
## Exc 0.75 0.76 0.82 0.83 0.81
## Inh 0.76 0.78 0.85 0.86 0.85
## Micro 0.61 0.57 0.54 0.55 0.54
## Oligo 0.85 0.72 0.68 0.68 0.68
## OPC 0.76 0.85 0.73 0.74 0.74
length(sm2_markers)## [1] 693
sm2_markers[1:4]## [1] "ABAT" "ABCA2" "ABCA8" "ABCC4"
table(sm2_markers %in% dat_combined$gene)##
## FALSE TRUE
## 23 670
dat_markers = dat_combined[which(dat_combined$gene %in% sm2_markers),]
dim(dat_markers)## [1] 670 30
dat_markers[1:2,]## ENSG gene Inh Exc Oligo OPC Astro Micro Endo Chr
## 23 ENSG00000002746 HECW1 58969 575538 314 6366 1604 0 0 chr7
## 30 ENSG00000003147 ICA1 898864 2591450 334 36952 164 2096 495 chr7
## Start End gene_length gene_10x Astro_10x Exc_L2_3_IT Exc_L4_IT
## 23 43112599 43566001 13535 HECW1 20254 7998995 4085283
## 30 8113184 8262579 3968 ICA1 5292 2037354 662684
## Exc_L5_IT Exc_L6_IT Inh_Lamp5 Micro_10x Oligo_10x OPC_10x Inh_Pvalb Inh_Sst
## 23 4809762 1606772 106715 10481 24589 168726 949779 129909
## 30 796024 379456 131221 14058 7838 30508 406668 155240
## Inh_Vip Chr_10x Start_10x End_10x gene_length_10x
## 23 435460 chr7 43112599 43566001 13535
## 30 175158 chr7 8113184 8262579 3968
cr_mat = cor(dat_markers[,cts_smrt], dat_markers[,cts_10x_adj],
method="spearman")
colnames(cr_mat) = gsub("_10x", "", colnames(cr_mat))
cr_mat = cr_mat[,order(colnames(cr_mat))]
round(cr_mat, 2)## Astro Exc_L2_3_IT Exc_L4_IT Exc_L5_IT Exc_L6_IT Inh_Lamp5 Inh_Pvalb
## Astro 0.85 0.27 0.25 0.24 0.26 0.25 0.26
## Exc 0.38 0.88 0.89 0.89 0.88 0.67 0.68
## Inh 0.36 0.64 0.66 0.65 0.63 0.85 0.81
## Micro -0.09 0.01 0.01 0.01 0.01 -0.05 -0.03
## Oligo 0.27 0.37 0.38 0.36 0.38 0.36 0.36
## OPC 0.42 0.46 0.46 0.44 0.44 0.53 0.51
## Inh_Sst Inh_Vip Micro Oligo OPC
## Astro 0.22 0.27 0.04 0.24 0.35
## Exc 0.70 0.69 0.23 0.47 0.51
## Inh 0.86 0.87 0.14 0.45 0.57
## Micro -0.06 -0.07 0.75 0.05 -0.12
## Oligo 0.38 0.37 0.08 0.86 0.37
## OPC 0.50 0.54 0.08 0.45 0.87
melted_cr_mat = reshape2::melt(cr_mat, value.name = "corr",
varnames=c("cell type SMART-seq",
"cell type 10x"))
dim(melted_cr_mat)## [1] 72 3
melted_cr_mat[1:2,]## cell type SMART-seq cell type 10x corr
## 1 Astro Astro 0.8540557
## 2 Exc Astro 0.3792698
ggplot(data = melted_cr_mat,
aes(x=`cell type 10x`, y=`cell type SMART-seq`, fill=corr)) +
geom_tile() + theme_classic() +
scale_fill_gradient(low = "white", high = "red") + theme1exc_cts = c("L2_3_IT", "L4_IT", "L5_IT", "L6_IT")
inh_cts = c("Lamp5", "Pvalb", "Sst", "Vip")
dat_combined[["Exc_10x"]] = rowSums(dat_combined[,paste0("Exc_", exc_cts)])
dat_combined[["Inh_10x"]] = rowSums(dat_combined[,paste0("Inh_", inh_cts)])
cts_10x_combined = paste0(cts_smrt, "_10x")
dat_markers = dat_combined[which(dat_combined$gene %in% sm2_markers),]
dim(dat_markers)## [1] 670 32
dat_markers[1:2,]## ENSG gene Inh Exc Oligo OPC Astro Micro Endo Chr
## 23 ENSG00000002746 HECW1 58969 575538 314 6366 1604 0 0 chr7
## 30 ENSG00000003147 ICA1 898864 2591450 334 36952 164 2096 495 chr7
## Start End gene_length gene_10x Astro_10x Exc_L2_3_IT Exc_L4_IT
## 23 43112599 43566001 13535 HECW1 20254 7998995 4085283
## 30 8113184 8262579 3968 ICA1 5292 2037354 662684
## Exc_L5_IT Exc_L6_IT Inh_Lamp5 Micro_10x Oligo_10x OPC_10x Inh_Pvalb Inh_Sst
## 23 4809762 1606772 106715 10481 24589 168726 949779 129909
## 30 796024 379456 131221 14058 7838 30508 406668 155240
## Inh_Vip Chr_10x Start_10x End_10x gene_length_10x Exc_10x Inh_10x
## 23 435460 chr7 43112599 43566001 13535 18500812 1621863
## 30 175158 chr7 8113184 8262579 3968 3875518 868287
cr_mat = cor(dat_markers[,cts_smrt], dat_markers[,cts_10x_combined],
method="spearman")
colnames(cr_mat) = gsub("_10x", "", colnames(cr_mat))
cr_mat = cr_mat[,order(colnames(cr_mat))]
round(cr_mat, 2)## Astro Exc Inh Micro Oligo OPC
## Astro 0.85 0.25 0.25 0.04 0.24 0.35
## Exc 0.38 0.90 0.68 0.23 0.47 0.51
## Inh 0.36 0.65 0.89 0.14 0.45 0.57
## Micro -0.09 0.00 -0.08 0.75 0.05 -0.12
## Oligo 0.27 0.36 0.35 0.08 0.86 0.37
## OPC 0.42 0.46 0.53 0.08 0.45 0.87
melted_cr_mat = reshape2::melt(cr_mat, value.name = "corr",
varnames=c("cell type SMART-seq",
"cell type 10x"))
dim(melted_cr_mat)## [1] 36 3
melted_cr_mat[1:2,]## cell type SMART-seq cell type 10x corr
## 1 Astro Astro 0.8540557
## 2 Exc Astro 0.3792698
ggplot(data = melted_cr_mat,
aes(x=`cell type 10x`, y=`cell type SMART-seq`, fill=corr)) +
geom_tile() + theme_classic() +
scale_fill_gradient(low = "white", high = "red") + theme1for(ct1 in cts_smrt){
g1 = ggplot(dat_combined, aes_string(x=sprintf("log10(%s)", ct1),
y=sprintf("log10(%s_10x)", ct1))) +
geom_point(alpha=0.5, color="orange") +
geom_abline(intercept = 0, slope=1, col="red")
print(g1)
}bulk = readRDS("../../CSeQTL/s7_gtexBrain/counts/trec_hap1_hap2_QCres.rds")
sapply(bulk, dim1)## $trec
## [1] 56200 174
##
## $hap1
## [1] 56200 174
##
## $hap2
## [1] 56200 174
##
## $filt
## [1] 3
##
## $rm_subj
## [1] 1
bulk = bulk$trec
dim(bulk)## [1] 56200 174
bulk[1:2,1:3]## GTEX-1192X GTEX-11DYG GTEX-11DZ1
## ENSG00000000003.14 337 365 232
## ENSG00000000005.5 9 6 7
rownames(bulk) = gsub("\\..*", "", rownames(bulk))
dim(dat_combined)## [1] 19179 32
table(dat_combined$ENSG %in% rownames(bulk))##
## TRUE
## 19179
bulk = bulk[match(dat_combined$ENSG, rownames(bulk)),]
dim(bulk)## [1] 19179 174
num_genes_tpm = nrow(bulk); num_genes_tpm## [1] 19179
get_tpm <- function(dat_bulk, gene_length){
dat_tpm = apply(dat_bulk,2,function(xx) xx / gene_length)
dat_tpm = 1e6 * apply(dat_tpm,2,function(xx) xx / sum(xx))
dat_tpm
}
bulk_tpm = get_tpm(bulk, dat_combined$gene_length)
dim(bulk)## [1] 19179 174
dim(bulk_tpm)## [1] 19179 174
bulk[1:2,1:3]## GTEX-1192X GTEX-11DYG GTEX-11DZ1
## ENSG00000000419 879 719 421
## ENSG00000000457 342 366 151
bulk_tpm[1:2,1:3]## GTEX-1192X GTEX-11DYG GTEX-11DZ1
## ENSG00000000419 99.651727 73.235920 89.624590
## ENSG00000000457 8.377769 8.055318 6.945898
dim(dat_combined)## [1] 19179 32
dat_combined[1:2,]## ENSG gene Inh Exc Oligo OPC Astro Micro Endo Chr
## 1 ENSG00000000419 DPM1 301355 683217 16911 13857 11489 1944 925 chr20
## 2 ENSG00000000457 SCYL3 55202 145707 3320 3051 6389 1091 0 chr1
## Start End gene_length gene_10x Astro_10x Exc_L2_3_IT Exc_L4_IT
## 1 50934867 50958555 1207 DPM1 32612 550753 205876
## 2 169849631 169894267 5586 SCYL3 19382 241470 81828
## Exc_L5_IT Exc_L6_IT Inh_Lamp5 Micro_10x Oligo_10x OPC_10x Inh_Pvalb Inh_Sst
## 1 213916 110076 27030 9424 31439 11798 90418 32660
## 2 113964 45380 15370 7634 16463 10702 40361 16518
## Inh_Vip Chr_10x Start_10x End_10x gene_length_10x Exc_10x Inh_10x
## 1 67887 chr20 50934867 50958555 1207 1080621 217995
## 2 32396 chr1 169849631 169894267 5586 482642 104645
cell_types = c("Astro", "Exc", "Inh", "Micro", "Oligo", "OPC")
cell_types_10x = paste0(cell_types, "_10x")
sig_sm2 = dat_combined[,cell_types]
sig_10x = dat_combined[,cell_types_10x]
dim(sig_sm2)## [1] 19179 6
sig_sm2[1:2,]## Astro Exc Inh Micro Oligo OPC
## 1 11489 683217 301355 1944 16911 13857
## 2 6389 145707 55202 1091 3320 3051
dim(sig_10x)## [1] 19179 6
sig_10x[1:2,]## Astro_10x Exc_10x Inh_10x Micro_10x Oligo_10x OPC_10x
## 1 32612 1080621 217995 9424 31439 11798
## 2 19382 482642 104645 7634 16463 10702
rownames(sig_sm2) = dat_combined$ENSG
rownames(sig_10x) = dat_combined$ENSG
length(sm2_markers)## [1] 693
sm2_markers[1:5]## [1] "ABAT" "ABCA2" "ABCA8" "ABCC4" "ACACB"
table(sm2_markers %in% dat_combined$gene)##
## FALSE TRUE
## 23 670
markers = dat_combined$ENSG[dat_combined$gene %in% sm2_markers]
length(markers)## [1] 670
stopifnot(all(markers %in% rownames(sig_sm2)))
stopifnot(all(markers %in% rownames(bulk_tpm)))
sig_sm2_tpm = get_tpm(sig_sm2, dat_combined$gene_length)
sig_10x_tpm = get_tpm(sig_10x, dat_combined$gene_length)
sig_sm2_tpm = sig_sm2_tpm[match(markers, rownames(sig_sm2_tpm)),]
sig_10x_tpm = sig_10x_tpm[match(markers, rownames(sig_10x_tpm)),]
bulk_tpm = bulk_tpm[match(markers, rownames(bulk_tpm)),]
dim(bulk_tpm)## [1] 670 174
dim(sig_sm2_tpm)## [1] 670 6
res_sm2 = ICeDT(bulk_tpm, sig_sm2_tpm, tumorPurity=rep(0,ncol(bulk_tpm)))## Adding 1e-5 to Y to ensure valid log transformation.
## Adding 1e-5 to Z to ensure valid log transformation.
## Iter 1: max diff of rho: 0.341202055892905
## .........
res_10x = ICeDT(bulk_tpm, sig_10x_tpm, tumorPurity=rep(0,ncol(bulk_tpm)))## Adding 1e-5 to Y to ensure valid log transformation.
## Iter 1: max diff of rho: 0.635354680664219
## ........
rho_sm2 = t(res_sm2$rho)
rho_10x = t(res_10x$rho)
dim(rho_sm2)## [1] 174 7
rho_sm2[1:2,]## tumor Astro Exc Inh Micro Oligo
## GTEX-1192X 0 0.2004574 0.6367593 0.0732894 0.016181474 0.03112912
## GTEX-11DYG 0 0.1021826 0.5752583 0.1400363 0.005418761 0.10679713
## OPC
## GTEX-1192X 0.04218334
## GTEX-11DYG 0.07030687
dim(rho_10x)## [1] 174 7
rho_10x[1:2,]## tumor Astro_10x Exc_10x Inh_10x Micro_10x Oligo_10x
## GTEX-1192X 0 0.2541224 0.4935282 0.09500614 0.04143025 0.05945821
## GTEX-11DYG 0 0.1440168 0.4220597 0.15709661 0.01607289 0.17249499
## OPC_10x
## GTEX-1192X 0.05645483
## GTEX-11DYG 0.08825899
rhos = cbind(rho_sm2[,-1], rho_10x[,-1])
rhos = as.data.frame(rhos)
dim(rhos)## [1] 174 12
glist = list()
cols = brewer.pal(n = length(cell_types), name = "Dark2")
for(i in 1:length(cell_types)){
cr_i = cor(rhos[[cell_types[i]]],
rhos[[cell_types_10x[i]]],
method="spearman")
glist[[cell_types[i]]] =
ggplot(rhos, aes_string(x=cell_types[i],
y=cell_types_10x[i])) +
geom_point(alpha=0.5, col=cols[i]) +
labs(title=sprintf("%s corr=%.3f", cell_types[i], cr_i),
x = "SMART-seq reference",
y = "10x reference")
}
ggarrange(plotlist=glist, nrow = 2, ncol = 3)gc()## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 4614209 246.5 7367025 393.5 NA 7367025 393.5
## Vcells 20210080 154.2 63997930 488.3 32768 63997930 488.3
sessionInfo()## R version 4.1.2 (2021-11-01)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] GenomicRanges_1.46.1 GenomeInfoDb_1.30.0 RColorBrewer_1.1-2
## [4] ggpointdensity_0.1.0 viridis_0.6.2 viridisLite_0.4.0
## [7] ggpubr_0.4.0 ggplot2_3.4.0 ICeDT_0.99.1
## [10] gtools_3.9.2 alabama_2022.4-1 numDeriv_2016.8-1.1
## [13] org.Hs.eg.db_3.14.0 AnnotationDbi_1.56.2 IRanges_2.28.0
## [16] S4Vectors_0.32.3 Biobase_2.54.0 BiocGenerics_0.40.0
## [19] stringr_1.4.0 data.table_1.14.2
##
## loaded via a namespace (and not attached):
## [1] httr_1.4.2 sass_0.4.0 tidyr_1.1.4
## [4] bit64_4.0.5 jsonlite_1.7.2 carData_3.0-4
## [7] bslib_0.3.1 assertthat_0.2.1 highr_0.9
## [10] blob_1.2.2 GenomeInfoDbData_1.2.7 yaml_2.2.1
## [13] pillar_1.6.4 RSQLite_2.2.8 backports_1.4.0
## [16] glue_1.6.2 digest_0.6.29 XVector_0.34.0
## [19] ggsignif_0.6.3 colorspace_2.0-2 plyr_1.8.6
## [22] cowplot_1.1.1 htmltools_0.5.2 pkgconfig_2.0.3
## [25] broom_0.7.10 zlibbioc_1.40.0 purrr_0.3.4
## [28] scales_1.2.1 tibble_3.1.6 KEGGREST_1.34.0
## [31] farver_2.1.0 generics_0.1.1 car_3.0-12
## [34] ellipsis_0.3.2 cachem_1.0.6 withr_2.5.0
## [37] cli_3.4.1 magrittr_2.0.1 crayon_1.4.2
## [40] memoise_2.0.1 evaluate_0.14 fansi_0.5.0
## [43] rstatix_0.7.0 tools_4.1.2 lifecycle_1.0.3
## [46] munsell_0.5.0 Biostrings_2.62.0 compiler_4.1.2
## [49] jquerylib_0.1.4 rlang_1.0.6 grid_4.1.2
## [52] RCurl_1.98-1.5 rstudioapi_0.13 labeling_0.4.2
## [55] bitops_1.0-7 rmarkdown_2.11 gtable_0.3.0
## [58] abind_1.4-5 DBI_1.1.1 reshape2_1.4.4
## [61] R6_2.5.1 gridExtra_2.3 knitr_1.36
## [64] dplyr_1.0.7 fastmap_1.1.0 bit_4.0.4
## [67] utf8_1.2.2 stringi_1.7.6 Rcpp_1.0.7
## [70] vctrs_0.5.1 png_0.1-7 tidyselect_1.1.1
## [73] xfun_0.28